Packages

Install any packages you might be missing:

library(dplyr)
library(ggplot2)
library(leaflet) 
library(sf) # handle simple features objects
library(units) # set_units() for converting units objects
library(spdep) # areal analysis

Areal Data and Maps

Set working directory where shapefiles are found

data.dir ="/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data"

Areal data are often stored as shapefiles (.shp). Shapefiles usually come in a set of at least 4 files:

  • .shp: main shapefile
  • .shx: spatial index
  • .dbf: dBase file which stores the non-spatial columns
  • .prj: definition of the projection of the spatial data

For example, the shapefile ca_counties.shp and its related files consist of polygons for California counties. We read shapefiles with the st_read() function from the sf package:

ca_counties = st_read(paste0(data.dir,"/shapes","/ca_counties.shp"))
## Reading layer `ca_counties' from data source `/Users/mf/Dropbox (University of Southern California)/Courses/PM569/Fall 2019/data/shapes/ca_counties.shp' using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 2 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.482 ymin: 32.52884 xmax: -114.1312 ymax: 42.00951
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
ca_counties
## Simple feature collection with 58 features and 2 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.482 ymin: 32.52884 xmax: -114.1312 ymax: 42.00951
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
##          county population                       geometry
## 1       Alameda    1666753 MULTIPOLYGON (((-122.2809 3...
## 2        Alpine       1101 MULTIPOLYGON (((-120.0733 3...
## 3        Amador      39383 MULTIPOLYGON (((-121.0273 3...
## 4         Butte     231256 MULTIPOLYGON (((-121.8565 3...
## 5     Calaveras      45602 MULTIPOLYGON (((-120.6309 3...
## 6        Colusa      21627 MULTIPOLYGON (((-122.0802 3...
## 7  Contra Costa    1150215 MULTIPOLYGON (((-122.2677 3...
## 8     Del Norte      27828 MULTIPOLYGON (((-124.3161 4...
## 9     El Dorado     190678 MULTIPOLYGON (((-121.1186 3...
## 10       Fresno     994400 MULTIPOLYGON (((-119.7054 3...
ca_counties %>% class()
## [1] "sf"         "data.frame"

We can use leaflet to map areal data:

ca_counties %>% 
  leaflet() %>% 
  addProviderTiles('Wikimedia') %>% 
  addPolygons(weight=1, fillOpacity=.25, label=~county)

Shapefiles also come with data (in the .dbf) They aggregate/summarize values (e.g., population, income) over geographic divisions (e.g., countries, states, counties, census blocks, etc.).

pop.pal = colorNumeric(c('beige','brown','brown'),
                       domain=ca_counties$population)
ca_counties %>% 
  leaflet() %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addPolygons(fillColor=~pop.pal(population), weight=.5, fillOpacity=.9,
              label=~paste0(county, ': ', population))
nc<-st_read(system.file("shape/nc.shp", package = "sf"), stringsAsFactors = FALSE)
## Reading layer `nc' from data source `/Library/Frameworks/R.framework/Versions/3.5/Resources/library/sf/shape/nc.shp' using driver `ESRI Shapefile'
## Simple feature collection with 100 features and 14 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
## epsg (SRID):    4267
## proj4string:    +proj=longlat +datum=NAD27 +no_defs
nc<-st_transform(nc,4326) # need to transform so that shapefile has WGS
nc %>% class()
## [1] "sf"         "data.frame"
nc
## Simple feature collection with 100 features and 14 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
##     AREA PERIMETER CNTY_ CNTY_ID        NAME  FIPS FIPSNO CRESS_ID BIR74
## 1  0.114     1.442  1825    1825        Ashe 37009  37009        5  1091
## 2  0.061     1.231  1827    1827   Alleghany 37005  37005        3   487
## 3  0.143     1.630  1828    1828       Surry 37171  37171       86  3188
## 4  0.070     2.968  1831    1831   Currituck 37053  37053       27   508
## 5  0.153     2.206  1832    1832 Northampton 37131  37131       66  1421
## 6  0.097     1.670  1833    1833    Hertford 37091  37091       46  1452
## 7  0.062     1.547  1834    1834      Camden 37029  37029       15   286
## 8  0.091     1.284  1835    1835       Gates 37073  37073       37   420
## 9  0.118     1.421  1836    1836      Warren 37185  37185       93   968
## 10 0.124     1.428  1837    1837      Stokes 37169  37169       85  1612
##    SID74 NWBIR74 BIR79 SID79 NWBIR79                       geometry
## 1      1      10  1364     0      19 MULTIPOLYGON (((-81.47276 3...
## 2      0      10   542     3      12 MULTIPOLYGON (((-81.23989 3...
## 3      5     208  3616     6     260 MULTIPOLYGON (((-80.45634 3...
## 4      1     123   830     2     145 MULTIPOLYGON (((-76.00897 3...
## 5      9    1066  1606     3    1197 MULTIPOLYGON (((-77.21767 3...
## 6      7     954  1838     5    1237 MULTIPOLYGON (((-76.74506 3...
## 7      0     115   350     2     139 MULTIPOLYGON (((-76.00897 3...
## 8      0     254   594     2     371 MULTIPOLYGON (((-76.56251 3...
## 9      4     748  1190     2     844 MULTIPOLYGON (((-78.30876 3...
## 10     1     160  2038     5     176 MULTIPOLYGON (((-80.02567 3...
plot(nc)

plot(nc[13])

sids.pal = colorNumeric(c('blue','purple','pink','yellow'),
                       domain=nc$SID79)
nc %>% 
  leaflet() %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addPolygons(fillColor=~sids.pal(SID79), weight=.5, fillOpacity=.9,
              label=~paste0(NAME, ': ', BIR79)) %>%
  addLegend("bottomleft", pal = sids.pal, values = ~SID79, title = "SIDS79 count", opacity = 1)
# try with sids rate

Areal analysis with spdep: Defining Neighbors

  • The main R package for areal analysis is spdep(). We need to create a sp object from the sf object for making connected graphs

  • We can then look at the connectivity of counties: shared border, nearest neighbours, and distance based.

  • For shared border connections, use poly2nb() and pick between queen and rook.

  • For k-nearest neighbours use knearneigh() to create matrix with index for the regions belonging to knn, and knn2nb() to create neighbourhood list.

  • For distance based connections, use dnearneigh().

# convert sf to spatial to use spdep
nc_sp <- sf:::as_Spatial(nc$geom)

# Shared border neighbours use poly2nb()
# Queen (default) and rook
sids_nb_queen<-poly2nb(nc_sp,queen=TRUE)
plot(nc_sp, main="Queen")
plot(sids_nb_queen,coordinates(nc_sp), add=TRUE,col="green")

#sharing full borders
sids_nb_rook<-poly2nb(nc_sp, queen=FALSE)
plot(nc_sp, main="Rook")
plot(sids_nb_rook,coordinates(nc_sp), add=TRUE,col="blue")

# see number of neighbours
card(sids_nb_rook)
##   [1] 3 3 5 2 4 3 3 5 4 3 4 4 5 4 3 6 3 8 4 3 2 5 5 5 6 5 6 5 5 5 5 3 5 6 4
##  [36] 6 6 3 9 5 4 6 6 5 2 6 6 8 6 5 7 5 6 6 5 2 6 4 4 3 6 6 7 4 5 4 8 5 5 6
##  [71] 5 4 3 6 3 3 2 5 7 2 3 6 5 4 4 4 4 6 4 2 6 3 4 5 3 5 7 4 2 3
summary(card(sids_nb_queen))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0     4.0     5.0     4.9     6.0     9.0
diffs<-diffnb(sids_nb_queen, sids_nb_rook)

plot(nc_sp, main="Difference queen, rook")
plot(diffs,coordinates(nc_sp),add=TRUE, col="red")

# k nearest neighbours
# knearneigh() creates matrix with index for the regions belonging to knn
# knn2nb() creates neighbourhood list
sids_kn1<-knn2nb(knearneigh(coordinates(nc_sp), k=1, RANN=FALSE))
sids_kn2<-knn2nb(knearneigh(coordinates(nc_sp), k=2, RANN=FALSE))
sids_kn4<-knn2nb(knearneigh(coordinates(nc_sp), k=4, RANN=FALSE))


plot(nc_sp, main="knn-1")
plot(sids_kn1, coordinates(nc_sp),add=TRUE,col="blue")

plot(nc_sp, main="knn-2")
plot(sids_kn2, coordinates(nc_sp),add=TRUE,col="green")

plot(nc_sp, main="knn-4")
plot(sids_kn4, coordinates(nc_sp), add=TRUE,col="black")

# difference between knn1 and knn2
diffs_knn<-diffnb(sids_kn1, sids_kn2)

plot(nc_sp, main="Diff knn-1, knn-2")
plot(diffs_knn, coordinates(nc_sp),add=TRUE,col="red")

# distance neighbours (still in degrees)
ndist<-unlist(nbdists(sids_kn1, coordinates(nc_sp)))
summary(ndist)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1197  0.2582  0.2964  0.2910  0.3239  0.4226
# distance based neighbours
# median distance of knn=1 is about 30km
# creating neighbours by epsilon=30km
# d1=smallest distance, usually 0, d2=distance you want to go from the point
median_dist_kn1<-median(ndist)
sids_dist1<-dnearneigh(coordinates(nc_sp), d1=0, d2=median_dist_kn1)
sids_dist1b<-dnearneigh(coordinates(nc_sp), d1=0, d2=0.43)


plot(nc_sp, main="Distance neighbours")
plot(sids_dist1, coordinates(nc_sp), add=T,col="blue")
plot(sids_dist1b, coordinates(nc_sp), add=T,col="blue")

# 0.25 to maximum distance
max_dist_kn1<-max(ndist)
sids_dist2<-dnearneigh(coordinates(nc_sp), d1=0.25*max_dist_kn1, d2=1*max_dist_kn1)

plot(nc_sp, main="Distance neighbours")
plot(sids_dist2, coordinates(nc_sp), add=T,col="green")

sids_dist3<-dnearneigh(coordinates(nc_sp), d1=max_dist_kn1, d2=1.5*max_dist_kn1)

plot(nc_sp, main="Distance neighbours")
plot(sids_dist3, coordinates(nc_sp), add=T,col="purple")

Areal analysis with spdep: Defining Weights

  • We use the neibourhood connections to define weight matrices that link information between blocks.

  • We decide which neighbouhood connections we want (border, knn, or distance) and convert to weights. We use nb2listw() to convert the neighbourhood to weights.

  • Weights matrix can be W=row standardized, B=binary, C=globally standardized.

sids_kn2_b<-nb2listw(sids_kn2, style="B")
sids_kn2_b
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 200 
## Percentage nonzero weights: 2 
## Average number of links: 2 
## Non-symmetric neighbours list
## 
## Weights style: B 
## Weights constants summary:
##     n    nn  S0  S1   S2
## B 100 10000 200 352 1686
summary(unlist(sids_kn2_b$weights))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1       1       1       1       1       1
sids_kn2_w<-nb2listw(sids_kn2, style="W")
sids_kn2_w
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 200 
## Percentage nonzero weights: 2 
## Average number of links: 2 
## Non-symmetric neighbours list
## 
## Weights style: W 
## Weights constants summary:
##     n    nn  S0 S1    S2
## W 100 10000 100 88 421.5
summary(unlist(sids_kn2_w$weights))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.5     0.5     0.5     0.5     0.5     0.5
  • Neighbourhoods where there are some blocks that have no connections can be problematic. This is common when you use distance neighbourhoods (large counties may not connect to a neighbour within a smaller distance).

  • Make sure to use zero.policy=TRUE in these cases

sids_dist1_w<-nb2listw(sids_dist1, style="W", zero.policy = TRUE)
  • Row standardized weights for queen
sids_queen_w<-nb2listw(sids_nb_queen, style="W")
sids_queen_w
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 490 
## Percentage nonzero weights: 4.9 
## Average number of links: 4.9 
## 
## Weights style: W 
## Weights constants summary:
##     n    nn  S0       S1       S2
## W 100 10000 100 44.65023 410.4746
summary(unlist(sids_queen_w$weights))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1111  0.1429  0.1667  0.2041  0.2500  0.5000
  • Minmax weights (divides weights by min-max row sums and max column sums)
sids_dist1_mm<-nb2listw(sids_dist2, style="minmax")
sids_dist1_mm
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 306 
## Percentage nonzero weights: 3.06 
## Average number of links: 3.06 
## 
## Weights style: minmax 
## Weights constants summary:
##          n    nn S0 S1       S2
## minmax 100 10000 51 17 118.4444
summary(unlist(sids_dist1_mm$weights))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1667  0.1667  0.1667  0.1667  0.1667  0.1667
  • S is variance stabilizing coding
sids_dist1_s<-nb2listw(sids_dist2, style="S")
sids_dist1_s
## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 100 
## Number of nonzero links: 306 
## Percentage nonzero weights: 3.06 
## Average number of links: 3.06 
## 
## Weights style: S 
## Weights constants summary:
##     n    nn  S0       S1       S2
## S 100 10000 100 67.06724 428.6124
summary(unlist(sids_dist1_s$weights))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.2382  0.2917  0.2917  0.3268  0.3368  0.5834
  • Inverse distance weighting, take distances from knn and apply \(1/x^2\) weights.

  • Need to define the weights first and then include in nb2listw

dists<-nbdists(sids_kn2, coordinates(nc_sp))
# inverse distance weighted weights
idw <- lapply(dists, function(x) 1/(x/2))
sids_idw_dist_w <- nb2listw(sids_kn2, glist = idw, style = "W")

Moran’s I

  • Moran’s I is a global test of association, whereby we test “is there spatial similarity in a variable given a particular adjacency matrix”?

  • We need to test a variable, in this case we will use SIDS rate (divide SIDS counts by birth rate).

  • We need an adjacency matrix, let’s choose the knn-2 that we turned into weights.

  • Note for these methods to get the data associated with the counties, we need the sf object (in this case, nc).

moranSIDS<-moran.test(nc$SID79/nc$BIR79,sids_kn2_w)
moranSIDS
## 
##  Moran I test under randomisation
## 
## data:  nc$SID79/nc$BIR79  
## weights: sids_kn2_w    
## 
## Moran I statistic standard deviate = 2.4465, p-value = 0.007213
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.214682382      -0.010101010       0.008442014

Geary’s c

  • Geary’s c is another global test of association.
gearySIDS<-geary.test(nc$SID79,sids_kn2_w)
gearySIDS
## 
##  Geary C test under randomisation
## 
## data:  nc$SID79 
## weights: sids_kn2_w 
## 
## Geary C statistic standard deviate = 2.7567, p-value = 0.002919
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic       Expectation          Variance 
##        0.68915805        1.00000000        0.01271429